library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(boot)
library(ggplot2)
library(readxl)
library(plotly)
## Warning: package 'plotly' was built under R version 4.3.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(stringr)
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.3.3
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
data <- read_excel('data/Sample_Customore_Order raw.xlsx', sheet = 'raw')
glimpse(data)
## Rows: 61,728
## Columns: 11
## $ customer_unique_id <dbl> 1, 2, 3, 4, 5, 6, 7, 7, 8, 9, 10, 11, 12, 13, 14…
## $ order_id <chr> "A000000001", "A000000002", "A000000003", "A0000…
## $ item_quantity <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ created_day <dttm> 2019-07-01, 2019-07-01, 2019-07-01, 2019-07-01,…
## $ onsite_original_price <dbl> 338000, 175000, 520000, 238000, 159000, 135000, …
## $ selling_price <dbl> 259000, 139000, 359000, 185000, 129000, 105000, …
## $ shipping_fee <dbl> 3000, 0, 14927, 29645, 14927, 0, 3000, 3000, 402…
## $ voucher_platform <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ voucher_seller <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ payment_method <chr> "Airpay GIRO", "Cash on Delivery", "Cybersource"…
## $ order_status <chr> "CANCELLED", "COMPLETED", "COMPLETED", "COMPLETE…
duplicate_rows <- data %>% dplyr::filter(duplicated(.))
# View the duplicate rows
glimpse(duplicate_rows)
## Rows: 1,897
## Columns: 11
## $ customer_unique_id <dbl> 26, 54, 84, 92, 137, 156, 214, 240, 341, 483, 64…
## $ order_id <chr> "A000000027", "A000000055", "A000000086", "A0000…
## $ item_quantity <dbl> 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ created_day <dttm> 2019-07-01, 2019-07-01, 2019-07-01, 2019-07-01,…
## $ onsite_original_price <dbl> 215000, 7000, 149000, 370000, 298000, 59000, 175…
## $ selling_price <dbl> 179000, 7000, 119000, 289000, 259000, 45000, 149…
## $ shipping_fee <dbl> 39974, 39000, 10500, 46426, 51425, 0, 0, 0, 0, 0…
## $ voucher_platform <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ voucher_seller <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ payment_method <chr> "Cash on Delivery", "Cash on Delivery", "Cash on…
## $ order_status <chr> "CANCELLED", "COMPLETED", "COMPLETED", "COMPLETE…
data <- data |> distinct()
glimpse(data)
## Rows: 59,831
## Columns: 11
## $ customer_unique_id <dbl> 1, 2, 3, 4, 5, 6, 7, 7, 8, 9, 10, 11, 12, 13, 14…
## $ order_id <chr> "A000000001", "A000000002", "A000000003", "A0000…
## $ item_quantity <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ created_day <dttm> 2019-07-01, 2019-07-01, 2019-07-01, 2019-07-01,…
## $ onsite_original_price <dbl> 338000, 175000, 520000, 238000, 159000, 135000, …
## $ selling_price <dbl> 259000, 139000, 359000, 185000, 129000, 105000, …
## $ shipping_fee <dbl> 3000, 0, 14927, 29645, 14927, 0, 3000, 3000, 402…
## $ voucher_platform <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ voucher_seller <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ payment_method <chr> "Airpay GIRO", "Cash on Delivery", "Cybersource"…
## $ order_status <chr> "CANCELLED", "COMPLETED", "COMPLETED", "COMPLETE…
data_total_price <- data |> mutate(total_price = selling_price + shipping_fee,
.before = 1)
data_sum <- data_total_price
data_sum <- data_sum |> group_by(created_day) |> summarise(total_price = sum(total_price))
glimpse(data_sum)
## Rows: 27
## Columns: 2
## $ created_day <dttm> 2019-07-01, 2019-07-02, 2019-07-03, 2019-07-04, 2019-07-0…
## $ total_price <dbl> 146796201, 134045071, 176594494, 130569104, 177917512, 422…
highlight_date <- as.Date(c("2019-07-12", "2019-07-21"))
# Create the ggplot2 object
p <- ggplot(data_sum, aes(x=created_day, y=total_price)) +
geom_line(color="steelblue") +
geom_point(data = subset(data_sum, created_day %in% highlight_date),
aes(x = created_day, y = total_price),
color = "red", size = 2, shape = 21, fill = "red", show.legend = TRUE) +
xlab("") +
geom_text(data = subset(data_sum, created_day %in% highlight_date),
aes(x = created_day, y = total_price,
label = paste("\n\nDate:", created_day, " Total:", total_price)),
vjust = -1.5, hjust = 1.1, color = "black", size = 3) +
theme(axis.text.x = element_text(angle = 60, hjust = 1))
p <- ggplotly(p)
p
data_mean <- data_total_price
data_mean <- data_mean |> group_by(created_day) |> summarise(mean_price = mean(total_price))
glimpse(data_mean)
## Rows: 27
## Columns: 2
## $ created_day <dttm> 2019-07-01, 2019-07-02, 2019-07-03, 2019-07-04, 2019-07-0…
## $ mean_price <dbl> 235627.9, 245055.0, 229641.7, 232329.4, 224077.5, 227207.6…
p <- ggplot(data_mean, aes(x=created_day, y=mean_price)) +
geom_line( color="steelblue") +
geom_point(data = subset(data_mean, created_day %in% highlight_date),
aes(x = created_day, y = mean_price),
color = "red", size = 2, shape = 21, fill = "red") +
xlab("") +
geom_text(data = subset(data_mean, created_day %in% highlight_date),
aes(x = created_day, y = mean_price,
label = paste("\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tDate:", created_day, "\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tMean:", round(mean_price))),
vjust = -1.5, hjust = 1.1, color = "black", size = 3) +
theme(axis.text.x=element_text(angle=60, hjust=1))
p <- ggplotly(p)
p
data_factor <- data
data_factor$payment_method <- factor(data_factor$payment_method)
data_factor$order_status <- factor(data_factor$order_status)
group_payment_method <- function(method) {
case_when(
str_detect(method, "VN Airpay Ibanking") ~ "VN Airpay Ibanking",
TRUE ~ as.character(method)
)
}
data_factor <- data_factor |> mutate(payment_method = sapply(payment_method, group_payment_method))
data_factor$payment_method <- factor(data_factor$payment_method)
data_factor$item_quantity <- factor(data_factor$item_quantity)
data_factor <- data_factor |> mutate(total_price = selling_price + shipping_fee,
.before = 1)
p <- ggplot_count_plot <- ggplot(data_factor, aes(x = payment_method)) +
geom_bar(fill = "steelblue") +
labs(title = "Count of Payment Methods",
x = "Payment Method",
y = "Count") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
p <- ggplotly(p)
p
p <- ggplot_count_plot <- ggplot(data_factor, aes(x = payment_method, fill = order_status)) +
geom_bar() +
scale_fill_manual(values = c("COMPLETED" = "steelblue", "CANCELLED" = "tomato")) +
labs(title = "Count of Payment Methods by Order Status",
x = "Payment Method",
y = "Count",
fill = "Order Status") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
p <- ggplotly(p)
p
data_factor1 <- data_factor |> dplyr::filter(data_factor$payment_method != "Cash on Delivery")
p <- ggplot_count_plot <- ggplot(data_factor1, aes(x = payment_method, fill = order_status)) +
geom_bar() +
scale_fill_manual(values = c("COMPLETED" = "steelblue", "CANCELLED" = "tomato")) +
labs(title = "Count of Payment Methods by Order Status (excluded COD)",
x = "Payment Method",
y = "Count",
fill = "Order Status") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
p <- ggplotly(p)
p
p <- ggplot_count_plot <- ggplot(data_factor, aes(x = item_quantity)) +
geom_bar(fill = "steelblue") +
labs(title = "Count of Item Quantity",
x = "Item Quantity",
y = "Count") +
theme(axis.text.x = element_text(angle = 0, hjust = 1))
p <- ggplotly(p)
p
data_sum_orderid <- data_factor
data_sum_orderid <- data_sum_orderid |> group_by(customer_unique_id, order_id, order_status, payment_method) |> summarise(total_price = sum(total_price), .groups = 'drop')
glimpse(data_sum_orderid)
## Rows: 37,835
## Columns: 5
## $ customer_unique_id <dbl> 1, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 9, 10, 10…
## $ order_id <chr> "A000000001", "A000004553", "A000000002", "A0000000…
## $ order_status <fct> CANCELLED, COMPLETED, COMPLETED, COMPLETED, COMPLET…
## $ payment_method <fct> Airpay GIRO, Airpay GIRO, Cash on Delivery, Cyberso…
## $ total_price <dbl> 262000, 232000, 234000, 373927, 214645, 143927, 150…
data_sum_order_complete <- data_sum_orderid |> dplyr::filter(data_sum_orderid$order_status == "COMPLETED")
# Define the two payment methods to highlight
highlight_methods <- c("Airpay GIRO", "Cash on Delivery")
p <- ggplot(data_sum_order_complete, aes(x = payment_method, y = total_price)) +
geom_jitter(aes(color = ifelse(payment_method %in% highlight_methods, as.character(payment_method), "Other")),
width = 0.2, height = 0) +
scale_color_manual(values = c(
"Airpay GIRO" = "steelblue",
"Cash on Delivery" = "tomato",
"Other" = "grey"
)) +
labs(title = "Scatter Plot of Payment Method vs Revenue",
x = "Payment Method",
y = "Revenue",
color = "Payment Method") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
p
data_summary <- data_sum_orderid %>%
group_by(order_status) %>%
summarize(count = n()) %>%
mutate(percentage = count / sum(count) * 100) # Calculate percentage
# Create a pie chart for the ratio of completed to canceled orders with custom colors
p <- ggplot(data_summary, aes(x = "", y = count, fill = order_status)) +
geom_bar(width = 1, stat = "identity") +
coord_polar(theta = "y") +
labs(title = "Ratio of Completed to Canceled Orders") +
theme_void() +
theme(legend.title = element_blank()) +
geom_text(aes(label = paste0(round(percentage, 1), "%")),
position = position_stack(vjust = 0.5),
color = "black",
size = 4,
fontface = "bold") +
scale_fill_manual(values = c("COMPLETED" = "steelblue", "CANCELLED" = "tomato"))
# Print the interactive plotly plot
p
data_sum_test <- data_total_price
data_sum_test <- data_sum_test |> group_by(customer_unique_id, order_id, order_status, shipping_fee) |> summarise(shipping_fee = unique(shipping_fee), .groups = 'drop')
glimpse(data_sum_test)
## Rows: 37,835
## Columns: 4
## $ customer_unique_id <dbl> 1, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 9, 10, 10…
## $ order_id <chr> "A000000001", "A000004553", "A000000002", "A0000000…
## $ order_status <chr> "CANCELLED", "COMPLETED", "COMPLETED", "COMPLETED",…
## $ shipping_fee <dbl> 3000, 3000, 0, 14927, 29645, 14927, 0, 3000, 40297,…
data_sum_order_complete_test <- data_sum_test |> dplyr::filter(data_sum_test$order_status == "COMPLETED")
glimpse(data_sum_order_complete_test)
## Rows: 30,273
## Columns: 4
## $ customer_unique_id <dbl> 1, 2, 3, 4, 6, 7, 8, 8, 8, 8, 8, 10, 10, 11, 15, 16…
## $ order_id <chr> "A000004553", "A000000002", "A000000003", "A0000000…
## $ order_status <chr> "COMPLETED", "COMPLETED", "COMPLETED", "COMPLETED",…
## $ shipping_fee <dbl> 3000, 0, 14927, 29645, 0, 3000, 40297, 0, 35641, 63…
data_sum_order_cancel_test <- data_sum_test |> dplyr::filter(data_sum_test$order_status == "CANCELLED")
glimpse(data_sum_order_cancel_test)
## Rows: 7,562
## Columns: 4
## $ customer_unique_id <dbl> 1, 5, 8, 9, 11, 12, 12, 12, 13, 14, 19, 26, 34, 41,…
## $ order_id <chr> "A000000001", "A000000005", "A000000008", "A0000000…
## $ order_status <chr> "CANCELLED", "CANCELLED", "CANCELLED", "CANCELLED",…
## $ shipping_fee <dbl> 3000, 14927, 40297, 0, 29980, 32100, 32150, 32100, …
data_sum_test |> dplyr::filter(data_sum_test$customer_unique_id == 8)
## # A tibble: 6 × 4
## customer_unique_id order_id order_status shipping_fee
## <dbl> <chr> <chr> <dbl>
## 1 8 A000000008 CANCELLED 40297
## 2 8 A000000318 COMPLETED 40297
## 3 8 A000000668 COMPLETED 0
## 4 8 A000002917 COMPLETED 35641
## 5 8 A000014714 COMPLETED 63045
## 6 8 A000031049 COMPLETED 500
ab_test <- merge(data_sum_order_complete_test, data_sum_order_cancel_test, by = "customer_unique_id")
glimpse(ab_test)
## Rows: 4,788
## Columns: 7
## $ customer_unique_id <dbl> 1, 8, 8, 8, 8, 8, 11, 19, 26, 34, 41, 59, 66, 66, 6…
## $ order_id.x <chr> "A000004553", "A000014714", "A000000318", "A0000310…
## $ order_status.x <chr> "COMPLETED", "COMPLETED", "COMPLETED", "COMPLETED",…
## $ shipping_fee.x <dbl> 3000, 63045, 40297, 500, 0, 35641, 0, 3000, 3000, 0…
## $ order_id.y <chr> "A000000001", "A000000008", "A000000008", "A0000000…
## $ order_status.y <chr> "CANCELLED", "CANCELLED", "CANCELLED", "CANCELLED",…
## $ shipping_fee.y <dbl> 3000, 40297, 40297, 40297, 40297, 40297, 29980, 300…
test1_t <- ab_test
test2_t <- ab_test
test1_t[5:7] <- NULL
glimpse(test1_t)
## Rows: 4,788
## Columns: 4
## $ customer_unique_id <dbl> 1, 8, 8, 8, 8, 8, 11, 19, 26, 34, 41, 59, 66, 66, 6…
## $ order_id.x <chr> "A000004553", "A000014714", "A000000318", "A0000310…
## $ order_status.x <chr> "COMPLETED", "COMPLETED", "COMPLETED", "COMPLETED",…
## $ shipping_fee.x <dbl> 3000, 63045, 40297, 500, 0, 35641, 0, 3000, 3000, 0…
test2_t[2:4] <- NULL
glimpse(test2_t)
## Rows: 4,788
## Columns: 4
## $ customer_unique_id <dbl> 1, 8, 8, 8, 8, 8, 11, 19, 26, 34, 41, 59, 66, 66, 6…
## $ order_id.y <chr> "A000000001", "A000000008", "A000000008", "A0000000…
## $ order_status.y <chr> "CANCELLED", "CANCELLED", "CANCELLED", "CANCELLED",…
## $ shipping_fee.y <dbl> 3000, 40297, 40297, 40297, 40297, 40297, 29980, 300…
We consider the question: “Does the shipping fee of an order influence whether a customer accepts or cancels it?”
We will focus on customers who meet both of the following conditions: “have canceled at least one order” and “have completed at least one order.”
test1_t <- test1_t |> distinct()
test2_t <- test2_t |> distinct()
test1_sum_t <- test1_t |> group_by(customer_unique_id) |> summarise(shipping_fee.x = sum(shipping_fee.x))
test2_sum_t <- test2_t |> group_by(customer_unique_id) |> summarise(shipping_fee.y = sum(shipping_fee.y))
ab_tess_1 <- merge(test1_sum_t, test2_sum_t, by = "customer_unique_id")
ab_tess_1 <- ab_tess_1 |> rename(shipping_fee_completed = shipping_fee.x) |> rename(shipping_fee_cancelled = shipping_fee.y)
glimpse(ab_tess_1)
## Rows: 3,004
## Columns: 3
## $ customer_unique_id <dbl> 1, 8, 11, 19, 26, 34, 41, 59, 66, 67, 68, 73, 9…
## $ shipping_fee_completed <dbl> 3000, 139483, 0, 3000, 3000, 0, 28659, 0, 2000,…
## $ shipping_fee_cancelled <dbl> 3000, 40297, 29980, 3000, 39974, 16050, 28659, …
mean_complete <- mean(ab_tess_1$shipping_fee_completed )
mean_cancel <- mean(ab_tess_1$shipping_fee_cancelled)
print(sprintf("Mean of price for completed order: %.10f", mean_complete))
## [1] "Mean of price for completed order: 12897.2749667111"
print(sprintf("Mean of price for cancelled order: %.10f", mean_cancel))
## [1] "Mean of price for cancelled order: 16496.6394806924"
data_plot <- tibble(
order_status = c("Completed", "Canceled"),
mean_shipping_fee = c(mean_complete, mean_cancel)
)
# Draw a bar plot using ggplot
p <- ggplot(data_plot, aes(x = order_status, y = mean_shipping_fee, fill = order_status)) +
geom_bar(stat = "identity", width = 0.5) +
labs(title = "Mean Shipping Fee by Order Status (for a selected group of customer)",
x = "Order Status",
y = "Mean Shipping Fee") +
theme_minimal() +
scale_fill_manual(values = c("Completed" = "steelblue", "Canceled" = "tomato")) +
theme(text = element_text(size = 10, face = "bold"))
# Print the plot
print(p)
For the selected customers, we observe that the average shipping fee for completed orders is lower than that of the orders they canceled.
A hypothesis can be proposed: “The shipping fee for canceled orders is higher than that for successful orders.” Therefore, we need to test the following null and alternative hypotheses:
Null hypothesis: \(H_0: \mu_0 = \mu_1\) Alternative hypothesis: \(H_1: \mu_1 < \mu_0\)
Where \(\mu_0\) is the average shipping fee of canceled orders, and \(\mu_1\) is the average shipping fee of completed orders.
If \(H_0\) is true, the difference in shipping fees between successful and canceled orders is purely a result of chance and is not statistically significant. To test this hypothesis, we will use a permutation test, and the p-value will be calculated for the left-tailed test.
perm_fun <- function(x1, x0, R)
{
n1 <- length(x1)
n0 <- length(x0)
n <- n1 + n0
mean_diff <- numeric(R)
combined_data <- c(x1, x0)
for (i in 1:R) {
idx_1 <- sample(x = 1:n, size = n1)
idx_0 <- setdiff(1:n, idx_1)
mean_diff[i] <- mean(combined_data[idx_1]) - mean(combined_data[idx_0])
}
return(mean_diff)
}
x1 <- ab_tess_1$shipping_fee_completed
x0 <- ab_tess_1$shipping_fee_cancelled
# Set the number of permutations
R <- 10000
# Run the permutation test
set.seed(42)
mean_diffs <- perm_fun(x1, x0, R)
ggplot(data = tibble(perm_diffs = mean_diffs), aes(x = perm_diffs)) +
geom_histogram(bins = 10, fill = "gray80", color = "black") +
labs(x = "Prices differences", y = "Frequency") +
theme_bw()
result <- mean(mean_diffs < (mean_complete - mean_cancel))
# Print the result
print(result)
## [1] 0
Since the p-value = 0 is smaller than both significance levels of 0.05 and 0.01, we reject the null hypothesis. This indicates that there is strong statistical evidence to conclude that the difference in shipping fees between completed and canceled orders is not due to random chance.
Given the statistical significance, it suggests that higher shipping fees are indeed associated with a higher likelihood of order cancellation. In other words, the evidence supports the idea that shipping fees impact whether customers complete or cancel their orders.
traffic <- read_excel('data/Sample_Customore_Traffic raw.xlsx')
glimpse(traffic)
## Rows: 198
## Columns: 10
## $ `Source / Medium` <chr> "google / cpc", "youtube / social", "(dire…
## $ Users <dbl> 407950, 77785, 64653, 48721, 27718, 14373,…
## $ `New Users` <dbl> 344502, 55537, 60181, 32781, 17774, 10110,…
## $ Sessions <dbl> 723208, 163447, 108534, 176662, 42464, 176…
## $ `Bounce Rate` <dbl> 0.6219013, 0.7881148, 0.5532644, 0.8440015…
## $ `Pages / Session` <dbl> 3.556041, 2.244324, 4.120340, 1.858996, 2.…
## $ `Avg. Session Duration` <dbl> 176.02568, 89.94474, 200.42917, 73.20609, …
## $ `Ecommerce Conversion Rate` <dbl> 0.009078992, 0.005855109, 0.009655960, 0.0…
## $ Transactions <dbl> 6566, 957, 1048, 500, 508, 127, 103, 43, 7…
## $ Revenue <dbl> 6656088123.8, 842907936.0, 1040851446.9, 5…
traffic <- traffic |> janitor::clean_names()
traffic <- na.omit(traffic)
glimpse(traffic)
## Rows: 197
## Columns: 10
## $ source_medium <chr> "google / cpc", "youtube / social", "(direct…
## $ users <dbl> 407950, 77785, 64653, 48721, 27718, 14373, 3…
## $ new_users <dbl> 344502, 55537, 60181, 32781, 17774, 10110, 1…
## $ sessions <dbl> 723208, 163447, 108534, 176662, 42464, 17631…
## $ bounce_rate <dbl> 0.6219013, 0.7881148, 0.5532644, 0.8440015, …
## $ pages_session <dbl> 3.556041, 2.244324, 4.120340, 1.858996, 2.98…
## $ avg_session_duration <dbl> 176.02568, 89.94474, 200.42917, 73.20609, 14…
## $ ecommerce_conversion_rate <dbl> 0.009078992, 0.005855109, 0.009655960, 0.002…
## $ transactions <dbl> 6566, 957, 1048, 500, 508, 127, 103, 43, 7, …
## $ revenue <dbl> 6656088123.8, 842907936.0, 1040851446.9, 534…
traffic_num <- traffic[ -c(1) ]
cor_matrix <- cor(traffic_num)
melted_cor <- melt(cor_matrix)
ggplot(melted_cor, aes(x=Var1, y=Var2, fill=value)) +
geom_tile() +
geom_text(aes(label=round(value, 2)), color="black", size=3) +
scale_fill_gradient2(low = "tomato", high = "tomato", mid = "white", midpoint = 0,
name = "Correlation") +
theme_minimal() +
labs(x = "Variables", y = "Variables", title = "Correlation Plot") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
traffic1 <- traffic |> mutate(percent_of_new_user = (new_users / users),
.before = 4)
glimpse(traffic1)
## Rows: 197
## Columns: 11
## $ source_medium <chr> "google / cpc", "youtube / social", "(direct…
## $ users <dbl> 407950, 77785, 64653, 48721, 27718, 14373, 3…
## $ new_users <dbl> 344502, 55537, 60181, 32781, 17774, 10110, 1…
## $ percent_of_new_user <dbl> 0.8444711, 0.7139808, 0.9308307, 0.6728310, …
## $ sessions <dbl> 723208, 163447, 108534, 176662, 42464, 17631…
## $ bounce_rate <dbl> 0.6219013, 0.7881148, 0.5532644, 0.8440015, …
## $ pages_session <dbl> 3.556041, 2.244324, 4.120340, 1.858996, 2.98…
## $ avg_session_duration <dbl> 176.02568, 89.94474, 200.42917, 73.20609, 14…
## $ ecommerce_conversion_rate <dbl> 0.009078992, 0.005855109, 0.009655960, 0.002…
## $ transactions <dbl> 6566, 957, 1048, 500, 508, 127, 103, 43, 7, …
## $ revenue <dbl> 6656088123.8, 842907936.0, 1040851446.9, 534…
traffic1 <- traffic1[order(-traffic1$revenue), ][1:30,]
top_10_users <- traffic1[1:10,]
# Display the result
print(top_10_users)
## # A tibble: 10 × 11
## source_medium users new_users percent_of_new_user sessions bounce_rate
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 google / cpc 407950 344502 0.844 723208 0.622
## 2 (direct) / (none) 64653 60181 0.931 108534 0.553
## 3 youtube / social 77785 55537 0.714 163447 0.788
## 4 facebook / social 48721 32781 0.673 176662 0.844
## 5 youtube.com / refe… 27718 17774 0.641 42464 0.639
## 6 l.facebook.com / r… 3661 1712 0.468 7144 0.371
## 7 m.facebook.com / r… 14373 10110 0.703 17631 0.700
## 8 newsletter / email 781 393 0.503 2168 0.405
## 9 zalo / zalo 2782 2400 0.863 4029 0.668
## 10 facebook.com / ref… 1594 694 0.435 2557 0.496
## # ℹ 5 more variables: pages_session <dbl>, avg_session_duration <dbl>,
## # ecommerce_conversion_rate <dbl>, transactions <dbl>, revenue <dbl>
highlight_sources <- c("google / cpc", "(direct) / (none)")
# Create a new column to highlight only selected sources
top_10_users$highlight <- ifelse(top_10_users$source_medium %in% highlight_sources, top_10_users$source_medium, "Other")
# Create the scatter plot
ggplot(top_10_users, aes(x = users, y = revenue, size = revenue, color = highlight)) +
geom_point(alpha = 0.7) + # Add transparency to the points
scale_size_continuous(range = c(3, 12), guide = "none") + # Hide size legend
scale_x_log10() + # Apply log10 transformation to the x-axis
scale_y_log10() + # Apply log10 transformation to the y-axis
scale_color_manual(values = c("(direct) / (none)" = "green", "google / cpc" = "blue", "Other" = "grey")) + # Custom colors
labs(x = "Users (log10)", y = "Revenue (log10)", color = "Source Medium") +
ggtitle("Scatter Plot Highlighting Two Source Mediums") +
theme_minimal() +
theme(legend.position = "right",
legend.text = element_text(size = 10), # Increase legend text size
legend.title = element_text(size = 10))
top_10_users <- top_10_users |> dplyr::filter(top_10_users$source_medium != "google / cpc")
highlight_sources <- c("youtube / social", "(direct) / (none)", "facebook / social")
top_10_users$highlight <- ifelse(top_10_users$source_medium %in% highlight_sources, top_10_users$source_medium, "Other")
ggplot(top_10_users, aes(x = users, y = revenue, size = revenue, color = highlight)) +
geom_point(alpha = 0.7) + # Add transparency
scale_size_continuous(range = c(3, 12), guide = "none") + # Hide size legend
scale_x_log10() + # Apply log10 transformation
scale_y_log10() + # Apply log10 transformation
scale_color_manual(values = c("(direct) / (none)" = "green", "youtube / social" = "red", "facebook / social" = "blue", "Other" = "grey")) + # Custom colors
labs(x = "Users (log10)", y = "Revenue (log10)", color = "Source Medium") +
ggtitle("Scatter Plot Highlighting Three Source Mediums") +
theme_minimal() +
theme(legend.position = "right",
legend.text = element_text(size = 10), # Increase legend text size
legend.title = element_text(size = 10))
# Create a new column to highlight only selected sources
top_10_users$highlight <- ifelse(top_10_users$source_medium %in% highlight_sources, top_10_users$source_medium, "Other")
ggplot(top_10_users, aes(x = new_users, y = revenue, size = revenue, color = highlight)) +
geom_point(alpha = 0.7) + # Add transparency
scale_size_continuous(range = c(3, 12), guide = "none") + # Hide size legend
scale_x_log10() + # Apply log10 transformation
scale_y_log10() + # Apply log10 transformation
scale_color_manual(values = c("(direct) / (none)" = "green", "youtube / social" = "red", "facebook / social" = "blue", "Other" = "grey")) + # Custom colors
labs(x = "New Users (log10)", y = "Revenue (log10)", color = "Source Medium") +
ggtitle("Scatter Plot Highlighting Three Source Mediums") +
theme_minimal() +
theme(legend.position = "right",
legend.text = element_text(size = 10), # Increase legend text size
legend.title = element_text(size = 10))
top_10_users$color <- ifelse(top_10_users$source_medium %in% highlight_sources, top_10_users$source_medium, "gray")
ggplot(top_10_users, aes(x = reorder(source_medium, percent_of_new_user), y = percent_of_new_user, fill = color)) +
geom_bar(stat = "identity") + # Bar plot
coord_flip() + # Flip coordinates for better readability
scale_fill_manual(values = c("(direct) / (none)" = "green", "youtube / social" = "red", "facebook / social" = "blue", "gray" = "gray")) + # Custom colors
labs(x = "Source Medium", y = "Percent of new users", title = "Percent of new users by Source Medium") +
theme_minimal() +
theme(legend.position = "none")
top_10_users$color <- ifelse(top_10_users$source_medium %in% highlight_sources, top_10_users$source_medium, "gray")
ggplot(top_10_users, aes(x = reorder(source_medium, bounce_rate), y = bounce_rate, fill = color)) +
geom_bar(stat = "identity") + # Bar plot
coord_flip() + # Flip coordinates for better readability
scale_fill_manual(values = c("(direct) / (none)" = "green", "youtube / social" = "red", "facebook / social" = "blue", "gray" = "gray")) + # Custom colors
labs(x = "Source Medium", y = "Bounce Rate (Lower is better)", title = "Bounce Rate by Source Medium") +
theme_minimal() +
theme(legend.position = "none")
top_10_users$color <- ifelse(top_10_users$source_medium %in% highlight_sources, top_10_users$source_medium, "gray")
ggplot(top_10_users, aes(x = reorder(source_medium, ecommerce_conversion_rate), y = ecommerce_conversion_rate, fill = color)) +
geom_bar(stat = "identity") + # Bar plot
coord_flip() + # Flip coordinates for better readability
scale_fill_manual(values = c("(direct) / (none)" = "green", "youtube / social" = "red", "facebook / social" = "blue", "gray" = "gray")) + # Custom colors
labs(x = "Source Medium", y = "Ecommerce Conversion Rate (Higher is better)", title = "Ecommerce Conversion Rate by Source Medium") +
theme_minimal() +
theme(legend.position = "none")
top_10_users$color <- ifelse(top_10_users$source_medium %in% highlight_sources, top_10_users$source_medium, "gray")
ggplot(top_10_users, aes(x = reorder(source_medium, avg_session_duration), y = avg_session_duration, fill = color)) +
geom_bar(stat = "identity") + # Bar plot
coord_flip() + # Flip coordinates for better readability
scale_fill_manual(values = c("(direct) / (none)" = "green", "youtube / social" = "red", "facebook / social" = "blue", "gray" = "gray")) + # Custom colors
labs(x = "Source Medium", y = "Average session duration (Higher is better)", title = "Average session duration by Source Medium") +
theme_minimal() +
theme(legend.position = "none")